Sprite 1984

home *** CD-ROM | disk | FTP | other *** search

/ Sprite 1984 - 1993 / Sprite 1984 - 1993.iso / src / kernel / rpc / rpcRecovery.c.old < prev next >

Wrap

Text File | 1992-12-18 | 34KB | 1,205 lines

/* * rpcRecovery.c -- * * The routines here maintain up/down state about other hosts. * Recovery actions that are registered via Rpc_HostNotify are * called-back when a host crashes and when it reboots. * Regular message traffic plus explicit pings are used to determine * the state of other hosts. The main external procedures are * Rpc_HostIsDown, used to query the state of another host, and * Rpc_WaitForHost, used to block a process until a host reboots. * (Rpc_WaitForHost isn't used much. Instead, modules rely on the * recovery callbacks to indicate that a host is back to life, and * they block processes in their own way.) * * One impact of these recovery hooks on the RPC system is that service * requests from a client that is just rebooting are blocked until * the recovery actions complete. * * Copyright 1987 Regents of the University of California * All rights reserved. */ #ifndef lint static char rcsid[] = "$Header: /cdrom/src/kernel/Cvsroot/kernel/rpc/rpcRecovery.c.old,v 9.0 89/09/12 15:18:27 douglis Stable $ SPRITE (Berkeley)"; #endif not lint #include "sprite.h" #include "rpc.h" #include "rpcInt.h" #include "sync.h" #include "hash.h" #include "mem.h" #include "trace.h" /* * The state of other hosts is kept in a hash table keyed on SpriteID. * This state is maintained by RpcHostAlive and RpcHostDead, which are * called in turn after packet reception or RPC timeout, respectively. * RpcHostDead is also called by the Rpc_Daemon if it can't get an * explicit acknowledgment from a client. 
*/ static Hash_Table recoveryHashTableStruct; static Hash_Table *recovHashTable = &recoveryHashTableStruct; typedef struct RecovHostState { int state; /* flags defined below */ int clientState; /* flags defined in rpc.h */ int spriteID; /* Sprite Host ID */ int bootID; /* Timestamp from RPC header */ Time time; /* Time of last message */ Sync_Condition alive; /* Notified when host comes up */ Sync_Condition recovery; /* Notified when recovery is complete */ } RecovHostState; /* * Access to the hash table is monitored. */ Sync_Lock rpcRecoveryLock; #define LOCKPTR (&rpcRecoveryLock) /* * Host state: * RECOV_STATE_UNKNOWN Initial state. * RECOV_HOST_ALIVE Set when we receive a message from the host * RECOV_HOST_DEAD Set when an RPC times out. * * RECOV_CRASH_CALLBACKS Set during the crash call-backs, this is used * to block RPC server processes until the * crash recovery actions have completed. * RECOV_HOST_PINGING Set while there are pinging call-backs scheduled * RECOV_HOST_BOOTING Set while there are pinging call-backs scheduled * * RECOV_WAITING artificial state to trace Rpc_WaitForHost * RECOV_CRASH artificial state to trace RpcCrashCallBacks * RECOV_REBOOT artificial state to trace RpcRebootCallBacks */ #define RECOV_STATE_UNKNOWN 0x0 #define RECOV_HOST_ALIVE 0x1 #define RECOV_HOST_DEAD 0x2 #define RECOV_CRASH_CALLBACKS 0x0100 #define RECOV_HOST_PINGING 0x0200 #define RECOV_HOST_BOOTING 0x0400 #define RECOV_WAITING 0x4 #define RECOV_CRASH 0x8 #define RECOV_REBOOT 0x10 /* * A host is "pinged" (to see when it reboots) at an interval determined by * rpcPingSeconds. */ int rpcPingSeconds = 30; /* * After a host reboots we pause a bit before attempting recovery. This * allows a host to complete boot-time start up. If we don't pause the * ping done by the recovery call backs may fail and we may erroneously * think that the other guy crashed right away. */ int rpcRecoveryPause = 30; /* Seconds */ /* * Other kernel modules can arrange call-backs when a host reboots. 
* The following list structure is used to keep these. The calling * sequence of the callback is as follows: * (*proc)(spriteID, clientData, when) * where 'when' is RPC_WHEN_HOST_DOWN or RPC_WHEN_HOST_REBOOTS (never both). */ typedef struct { List_Links links; void (*proc)(); int flags; /* RPC_WHEN_HOST_DOWN, RPC_WHEN_HOST_REBOOTS */ ClientData clientData; } NotifyElement; List_Links rpcNotifyList; /* * A trace is kept for debugging/understanding the host state transisions. */ typedef struct RpcRecovTraceRecord { int spriteID; /* Host ID whose state changed */ int state; /* Their new state */ } RpcRecovTraceRecord; /* * Tracing events, these describe the trace record. Note that some * trace types are defined in rpc.h for use with Rpc_HostTrace. * * RECOV_CUZ_WAIT Wait in Rpc_WaitForHost * RECOV_CUZ_WAKEUP Wakeup in Rpc_WaitForHost * RECOV_CUZ_INIT First time we were interested in the host * RECOV_CUZ_REBOOT We detected a reboot * RECOV_CUZ_CRASH We detected a crash * RECOV_CUZ_DONE Recovery actions completed * RECOV_CUZ_PING_CHK We are pinging the host to check it out * RECOV_CUZ_PING_ASK We are pinging the host because we were asked */ #define RECOV_CUZ_WAIT 0x1 #define RECOV_CUZ_WAKEUP 0x2 #define RECOV_CUZ_INIT 0x4 #define RECOV_CUZ_REBOOT 0x8 #define RECOV_CUZ_CRASH 0x10 #define RECOV_CUZ_DONE 0x20 #define RECOV_CUZ_PING_CHK 0x40 #define RECOV_CUZ_PING_ASK 0x80 Trace_Header rpcRecovTraceHdr; Trace_Header *rpcRecovTraceHdrPtr = &rpcRecovTraceHdr; int rpcRecovTraceLength = 50; Boolean rpcRecovTracing = TRUE; #ifndef CLEAN #define RECOV_TRACE(zspriteID, zstate, event) \ if (rpcRecovTracing) {\ RpcRecovTraceRecord rec;\ rec.spriteID = zspriteID;\ rec.state = zstate;\ Trace_Insert(rpcRecovTraceHdrPtr, event, &rec);\ } #else #define RECOV_TRACE(zspriteID, zstate, event) #endif not CLEAN /* * Forward declarations. 
*/ void RpcRebootCallBacks(); void RpcCrashCallBacks(); void MarkRecoveryComplete(); int GetHostState(); void StartPinging(); void CheckHost(); void StopPinging(); /* *---------------------------------------------------------------------- * * RpcInitRecovery -- * * Set up the data structures used by the RpcRecovery module. * * Results: * None. * * Side effects: * None. * *---------------------------------------------------------------------- */ void RpcInitRecovery() { Hash_Init(recovHashTable, 8, HASH_ONE_WORD_KEYS); List_Init(&rpcNotifyList); Trace_Init(rpcRecovTraceHdrPtr, rpcRecovTraceLength, sizeof(RpcRecovTraceRecord), 0); } /* *---------------------------------------------------------------------- * * Rpc_HostNotify -- * * Add a call-back for other modules to use when a host crashes/reboots. * The 'when' parameter specifies when to callback the client procedure. * If RPC_WHEN_HOST_DOWN then the procedure is called when the RPC * module has gotten a timeout trying to reach the host. If it is * RPC_WHEN_HOST_REBOOTS then the call-back is made when the RPC * module detects a reboot due to the bootID changing. If both * are specified then the call-back is made at both times. * * Results: * None. * * Side effects: * Entry added to notify list. 
*
 *----------------------------------------------------------------------
 */
void
Rpc_HostNotify(proc, clientData, when)
    void	(*proc)();
    ClientData	clientData;
    int		when;	/* RPC_WHEN_HOST_DOWN, RPC_WHEN_HOST_REBOOTS */
{
    register NotifyElement	*elemPtr;
    register List_Links		*linkPtr;

    elemPtr = (NotifyElement *) Mem_Alloc(sizeof(NotifyElement));
    linkPtr = (List_Links *) elemPtr;

    /*
     * Fill in the element, then append it so that call-backs are kept
     * in the order in which they were registered.
     */
    List_InitElement(linkPtr);
    elemPtr->flags = when;
    elemPtr->clientData = clientData;
    elemPtr->proc = proc;
    List_Insert(linkPtr, LIST_ATREAR(&rpcNotifyList));
}

/*
 *----------------------------------------------------------------------
 *
 * Rpc_HostIsDown --
 *
 *	Decide whether the specified host is down.  The answer is based
 *	on the state returned by GetHostState:  a host known to be dead
 *	yields FAILURE, a host with recent message traffic yields
 *	SUCCESS, and otherwise the host is pinged and the status of
 *	that ping is returned.  If the answer is not SUCCESS and the
 *	caller asked for it, background pinging is started so that the
 *	call-backs set up with Rpc_HostNotify get made when the host
 *	comes back to life.
 *
 * Results:
 *	SUCCESS if the host is up, FAILURE if it is known to be down,
 *	otherwise the status returned by Rpc_Ping (for example
 *	RPC_SERVICE_DISABLED if the host says so).
 *
 * Side effects:
 *	May do a ping.  If the 'ping' parameter is TRUE this will make
 *	sure that pinging is in progress if the host is down.
* *---------------------------------------------------------------------- */ ReturnStatus Rpc_HostIsDown(spriteID, ping) int spriteID; Boolean ping; /* If TRUE, we make sure the host is being pinged * if it is down now */ { register ReturnStatus status = SUCCESS; if (spriteID == NET_BROADCAST_HOSTID) { Sys_Panic(SYS_WARNING, "Rpc_HostIsDown, got broadcast address\n"); return(SUCCESS); } switch (GetHostState(spriteID)) { case RECOV_STATE_UNKNOWN: RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_PING_ASK); status = Rpc_Ping(spriteID); break; case RECOV_HOST_ALIVE: status = SUCCESS; break; case RECOV_HOST_DEAD: status = FAILURE; break; } if (status != SUCCESS && ping) { StartPinging(spriteID); } return(status); } /* *---------------------------------------------------------------------- * * Rpc_WaitForHost -- * * Block the current process (at an interruptable priority) until * the given host comes back up. This is used when retrying * filesystem operations when a fileserver goes down, for example. * * Results: * TRUE if the wait was interrupted. * * Side effects: * The current process is blocked * until messages from the host indicate it is up. * *---------------------------------------------------------------------- */ Boolean Rpc_WaitForHost(spriteID) int spriteID; /* Host to monitor */ { /* * Set up the hosts state (dead or alive) by pinging it. * If it's down we drop into a monitored routine to do * the actual waiting. It will check again to make sure * we don't sleep on an alive host. */ if (Rpc_HostIsDown(spriteID, TRUE) == FAILURE) { RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, RECOV_CUZ_WAIT); return(RpcWaitForHostInt(spriteID)); } else { return(FALSE); } } /* *---------------------------------------------------------------------- * * RpcWaitForHostInt -- * * Block the current process (at an interruptable priority) until * the given host comes back up. Our caller should have already * probed to host with Rpc_HostIsDown so that pinging is already * initiated. 
* * Results: * TRUE is the wait was interrupted by a signal. * * Side effects: * If the host is thought down, the current process is blocked * until messages from the host indicate it is up. * *---------------------------------------------------------------------- */ ENTRY Boolean RpcWaitForHostInt(spriteID) int spriteID; /* Host to monitor */ { Hash_Entry *hashPtr; RecovHostState *hostPtr; Boolean sigPending = FALSE; LOCK_MONITOR; if (spriteID <= 0 || spriteID == rpc_SpriteID) { Sys_Panic(SYS_FATAL, "RpcWaitForHostInt, bad hostID %d\n", spriteID); UNLOCK_MONITOR; return(FALSE); } hashPtr = Hash_Find(recovHashTable, spriteID); if (hashPtr->value == (Address)NIL) { Sys_Panic(SYS_FATAL, "RpcWaitForHostInt, no host state\n"); UNLOCK_MONITOR; return; } else { hostPtr = (RecovHostState *)hashPtr->value; } while (!sigPending && (hostPtr->state & RECOV_HOST_DEAD)) { sigPending = Sync_Wait(&hostPtr->alive, TRUE); } RECOV_TRACE(hostPtr->spriteID, hostPtr->state, RECOV_CUZ_WAKEUP); UNLOCK_MONITOR; return(sigPending); } /* *---------------------------------------------------------------------- * * Rpc_HostPrint -- * * Print out a statement concerning a host. This maps to a * string hostname if possible, and prints out the message. * The strings come from netAddresses.c (but should be imported * via Net_RouteInstall system call). * * MOVE THIS TO NET * * Results: * None. * * Side effects: * Sys_Printf. * *---------------------------------------------------------------------- */ void Rpc_HostPrint(spriteID, string) int spriteID; char *string; { char *hostName; Net_SpriteIDToName(spriteID, &hostName); if (hostName == (char *)NIL) { Sys_Printf("Sprite Host <%d> %s\n", spriteID, string); } else { Sys_Printf("Sprite Host %s (%d) %s\n", hostName, spriteID, string); } } /* *---------------------------------------------------------------------- * * Rpc_HostTrace -- * * Add an entry to the rpc recovery trace. * * Results: * None. * * Side effects: * None. 
* *---------------------------------------------------------------------- */ ENTRY void Rpc_HostTrace(spriteID, event) int spriteID; int event; { LOCK_MONITOR; RECOV_TRACE(spriteID, RECOV_STATE_UNKNOWN, event); UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * Rpc_HostGetState -- * * Return the client state associated with a host. The recovery host * table is a convenient object keyed on spriteID. Other modules can * set their own state in the table (beyond the simple up/down state * mainted by the rest of this module), and retrieve it with this call. * * Results: * A copy of the clientState field. 0 is returned if there is no * host table entry. * * Side effects: * None. * *---------------------------------------------------------------------- */ ENTRY int Rpc_HostGetState(spriteID) int spriteID; { Hash_Entry *hashPtr; RecovHostState *hostPtr; int result = 0; LOCK_MONITOR; hashPtr = Hash_LookOnly(recovHashTable, spriteID); if (hashPtr != (Hash_Entry *)NIL) { hostPtr = (RecovHostState *)hashPtr->value; if (hostPtr != (RecovHostState *)NIL) { result = hostPtr->clientState; } } UNLOCK_MONITOR; return(result); } /* *---------------------------------------------------------------------- * * Rpc_HostSetState -- * * Set the client state associated with a host. This completely * overwrites the previous value of the client state. * * Results: * None. * * Side effects: * Sets the clientState field of the host state. This will add an * entry to the host table if one doesn't alreay exist. Its RPC * up/down state is set to "unknown" in this case. 
*
 *----------------------------------------------------------------------
 */
ENTRY void
Rpc_HostSetState(spriteID, state)
    int spriteID;
    int state;
{
    register Hash_Entry		*entryPtr;
    register RecovHostState	*statePtr;

    LOCK_MONITOR;

    entryPtr = Hash_Find(recovHashTable, spriteID);
    if (entryPtr->value == (Address)NIL) {
	/*
	 * First reference to this host.  Create zeroed state for it that
	 * says nothing yet about whether the host is up or down.
	 */
	statePtr = Mem_New(RecovHostState);
	Byte_Zero(sizeof(RecovHostState), (Address)statePtr);
	statePtr->spriteID = spriteID;
	statePtr->state = RECOV_STATE_UNKNOWN;
	entryPtr->value = (Address)statePtr;
    } else {
	statePtr = (RecovHostState *)entryPtr->value;
    }
    statePtr->clientState = state;

    UNLOCK_MONITOR;
}

/*
 *----------------------------------------------------------------------
 *
 * RpcHostAlive --
 *
 *	Mark the host as being alive.  This is called when we've received
 *	a message from the host.  It uses state from the host table and
 *	the bootID parameter to detect reboots.  If a reboot is detected
 *	but we thought the host was up then the Crash call-backs are invoked.
 *	In any case, a reboot invokes the Reboot call-backs.  (Call-backs
 *	are installed with Rpc_HostNotify.)  Finally, a time stamp is
 *	kept so we can check when we last got a message from a host.
 *
 *	This procedure is called from client RPC upon successful completion
 *	of an RPC, and by server RPC upon receipt of a client request.
 *	These two cases are identified by the 'asyncRecovery' parameter.
 *	Servers want synchronous recovery so they don't service anything
 *	until state associated with that client has been cleaned up via
 *	the Crash call-backs.  So RpcHostAlive blocks (if !asyncRecovery)
 *	until the crash call-backs are complete.  Clients don't have the
 *	same worries so they let the crash call-backs complete in the
 *	background (asyncRecovery is TRUE).
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Updates the boot timestamp of the other host.  Procedures installed
 *	with Rpc_HostNotify are called when the bootID changes.
A timestamp * of when this message was received is obtained from the "cheap" clock * so we can tell later if there has been recent message traffic. * *---------------------------------------------------------------------- */ ENTRY void RpcHostAlive(spriteID, bootID, asyncRecovery) int spriteID; /* Host ID of the message sender */ int bootID; /* Boot time stamp from message header */ Boolean asyncRecovery; /* TRUE means do recovery call-backs in * the background. FALSE causes the process * to wait until crash recovery is complete. */ { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; Boolean reboot = FALSE; /* Used to control print statements at reboot */ LOCK_MONITOR; if (spriteID == NET_BROADCAST_HOSTID || bootID == 0) { /* * Don't track the broadcast address. Also ignore zero valued * bootIDs. These come from hosts at early boot time, or * in certain error conditions like trying to send too much * data in a single RPC. */ UNLOCK_MONITOR; return; } hashPtr = Hash_Find(recovHashTable, spriteID); if (hashPtr->value == (Address)NIL) { /* * Initialize the host's state. This is the first time we've talked * to it since we've been up, so take no action. */ hostPtr = Mem_New(RecovHostState); hashPtr->value = (Address)hostPtr; Byte_Zero(sizeof(RecovHostState), (Address)hostPtr); hostPtr->state = RECOV_HOST_ALIVE; hostPtr->spriteID = spriteID; hostPtr->bootID = bootID; Rpc_HostPrint(spriteID, "is up"); RECOV_TRACE(spriteID, RECOV_HOST_ALIVE, RECOV_CUZ_INIT); } else { hostPtr = (RecovHostState *)hashPtr->value; } if (hostPtr != (RecovHostState *)NIL) { /* * Have to read the clock in order to suppress repeated pings, * see GetHostState and Rpc_HostIsDown. */ Timer_GetTimeOfDay(&hostPtr->time, (int *)NIL, (Boolean *)NIL); /* * Check for a rebooted peer by comparing boot time stamps. * The first process to detect this initiates recovery actions. 
*/ if (hostPtr->bootID != bootID) { Rpc_HostPrint(spriteID, "rebooted"); hostPtr->bootID = bootID; reboot = TRUE; RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_REBOOT); if (hostPtr->state & RECOV_HOST_ALIVE) { /* * A crash occured un-detected. We do the crash call-backs * first, and block server processes in the meantime. * RECOV_CRASH_CALLBACKS flag is reset by RpcCrashCallBacks. * The host is marked dead here so we fall into the * switch below and call the reboot callbacks. */ RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_REBOOT); hostPtr->state &= ~RECOV_HOST_ALIVE; hostPtr->state |= (RECOV_HOST_DEAD | RECOV_CRASH_CALLBACKS); Proc_CallFunc(RpcCrashCallBacks, spriteID, 0); } } /* * Block servers until crash recovery actions complete. * Servers are synchronous with respect to reboot recovery. * This blocks requests from clients until after the * recovery actions complete. */ if (! asyncRecovery) { while (hostPtr->state & RECOV_CRASH_CALLBACKS) { Sync_Wait(&hostPtr->recovery, FALSE); if (sys_ShuttingDown) { Sys_Printf("Warning, Server exiting from RpcHostAlive\n"); Proc_Exit(1); } } } /* * Now that we've taken care of crash recovery, we see if the host * is newly up. If so, invoke the reboot call-backs and then notify * waiting processes. This means clientA (us) may start * re-opening files from serverB (the other guy) at the same time * as clientA (us) is closing files that serverB had had open. * ie. both the crash and reboot call backs may proceed in parallel. */ switch(hostPtr->state & ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING)) { case RECOV_HOST_ALIVE: /* * Host already alive. */ break; case RECOV_HOST_DEAD: { register int wait; /* * Notify interested parties that the host is up. If the host * has done a full reboot we wait a bit before pounding on * it with our re-open requests. This gives it a chance * to create RPC server processes, etc. so we don't think * it crashed because we tried to talk to it too soon. 
*/ if ( !reboot ) { Rpc_HostPrint(spriteID, "is back again"); wait = 0; } else { wait = timer_IntOneSecond * rpcRecoveryPause; } hostPtr->state &= ~RECOV_HOST_DEAD; hostPtr->state |= RECOV_HOST_ALIVE; Sync_Broadcast(&hostPtr->alive) Proc_CallFunc(RpcRebootCallBacks, spriteID, wait); break; default: Sys_Panic(SYS_WARNING, "Unexpected state <%x> for ", hostPtr->state); Rpc_HostPrint(spriteID, ""); break; } } } UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * RpcHostDead -- * * Change the host's state to "dead". This is called from client RPC * when an RPC timed out with no response. It is also called by the * Rpc_Daemon when it can't recontact a client to get an explicit * acknowledgment. * * Results: * None. * * Side effects: * Sets the state in the host state table to dead. Pings are not * initiated here because we may or may not be interested in * the other host. See Rpc_HostIsDown. * *---------------------------------------------------------------------- */ ENTRY void RpcHostDead(spriteID) int spriteID; { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; LOCK_MONITOR; if (spriteID == NET_BROADCAST_HOSTID || rpc_NoTimeouts) { /* * If rpcNoTimeouts is set the Rpc_Daemon may still call us if * it can't get an acknowledgment from a host to close down * a connection. We ignore this so that we don't take action * against the offending host (who is probably in the debugger) */ UNLOCK_MONITOR; return; } hashPtr = Hash_LookOnly(recovHashTable, spriteID); if (hashPtr != (Hash_Entry *)NIL) { hostPtr = (RecovHostState *)hashPtr->value; if (hostPtr != (RecovHostState *)NIL) { switch(hostPtr->state & ~(RECOV_CRASH_CALLBACKS| RECOV_HOST_PINGING)) { case RECOV_HOST_DEAD: /* * Host already dead. 
*/ break; case RECOV_STATE_UNKNOWN: case RECOV_HOST_ALIVE: hostPtr->state &= ~(RECOV_HOST_ALIVE|RECOV_STATE_UNKNOWN); hostPtr->state |= RECOV_HOST_DEAD|RECOV_CRASH_CALLBACKS; Rpc_HostPrint(spriteID, "is down"); RECOV_TRACE(spriteID, hostPtr->state, RECOV_CUZ_CRASH); Proc_CallFunc(RpcCrashCallBacks, spriteID, 0); break; } } } UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * RpcRebootCallBacks -- * * This calls the call-back procedures installed by other modules * via Rpc_HostNotify. It is invoked asynchronously from RpcHostAlive * when that procedure detects a reboot. It does an explict ping * of the other host to make sure it is ready for our recovery actions. * This will reschedule itself for later if the host isn't ready. * * Results: * None. * * Side effects: * Invoke the call-backs. * *---------------------------------------------------------------------- */ void RpcRebootCallBacks(data, callInfoPtr) ClientData data; Proc_CallInfo *callInfoPtr; { ReturnStatus status; register NotifyElement *notifyPtr; register int spriteID = (int)data; status = Rpc_Ping(spriteID); switch(status) { case RPC_SERVICE_DISABLED: Rpc_HostPrint(spriteID, "still booting"); callInfoPtr->interval = rpcRecoveryPause * timer_IntOneSecond; break; case RPC_TIMEOUT: Rpc_HostPrint(spriteID, "not responding"); callInfoPtr->interval = rpcRecoveryPause * timer_IntOneSecond; break; case SUCCESS: LIST_FORALL(&rpcNotifyList, (List_Links *)notifyPtr) { if (notifyPtr->flags & RPC_WHEN_HOST_REBOOTS) { (*notifyPtr->proc)(spriteID, notifyPtr->clientData, RPC_WHEN_HOST_REBOOTS); } } RECOV_TRACE(spriteID, RECOV_REBOOT, RECOV_CUZ_DONE); callInfoPtr->interval = 0; /* Don't call again */ break; } } /* *---------------------------------------------------------------------- * * RpcCrashCallBacks -- * * Invoked asynchronously from RpcHostDead so that other modules * can clean up behind the crashed host. When done the host * is marked as having recovery complete. 
This unblocks server * processes stalled in RpcHostAlive. * * Results: * None. * * Side effects: * Invoke the call-backs with the RPC_WHEN_HOST_DOWN flag. * Clears the recovery in progress flag checked in RpcHostAlive. * *---------------------------------------------------------------------- */ void RpcCrashCallBacks(data, callInfoPtr) ClientData data; Proc_CallInfo *callInfoPtr; { register NotifyElement *notifyPtr; register int spriteID = (int)data; LIST_FORALL(&rpcNotifyList, (List_Links *)notifyPtr) { if (notifyPtr->flags & RPC_WHEN_HOST_DOWN) { (*notifyPtr->proc)(spriteID, notifyPtr->clientData, RPC_WHEN_HOST_DOWN); } } MarkRecoveryComplete(spriteID); RECOV_TRACE(spriteID, RECOV_CRASH, RECOV_CUZ_DONE); callInfoPtr->interval = 0; /* Don't call again */ } /* *---------------------------------------------------------------------- * * MarkRecoveryComplete -- * * The recovery call-backs have completed, and this procedure's * job is to mark that fact in the host hash table and to notify * any processes that are blocked in RpcHostAlive waiting for this. * * Results: * None. * * Side effects: * Sets the state, if any, in the host state table. * Notifies the hostPtr->recovery condition * *---------------------------------------------------------------------- */ ENTRY static void MarkRecoveryComplete(spriteID) { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; LOCK_MONITOR; hashPtr = Hash_LookOnly(recovHashTable, spriteID); if (hashPtr != (Hash_Entry *)NIL) { hostPtr = (RecovHostState *)hashPtr->value; if (hostPtr != (RecovHostState *)NIL) { hostPtr->state &= ~RECOV_CRASH_CALLBACKS; Sync_Broadcast(&hostPtr->recovery); } } UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * GetHostState -- * * This looks into the host table to see and provides a guess * as to the host's current state. It uses a timestamp kept in * the host state to see if there's been recent message traffic. * If so, RECOV_HOST_ALIVE is returned. 
If not, RECOV_STATE_UNKNOWN * is returned and the caller should ping to make sure. Finally, * if it is known that the host is down already, then RECOV_HOST_DEAD * is returned. * * Results: * RECOV_STATE_UNKNOWN if the caller should ping to make sure. * RECOV_HOST_ALIVE if the host is up (recent message traffic). * RECOV_HOST_DEAD if the host is down (recent timeouts). * * Side effects: * None. * *---------------------------------------------------------------------- */ ENTRY int GetHostState(spriteID) int spriteID; { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; register int state = RECOV_STATE_UNKNOWN; Time time; LOCK_MONITOR; hashPtr = Hash_LookOnly(recovHashTable, spriteID); if (hashPtr != (Hash_Entry *)NIL) { hostPtr = (RecovHostState *)hashPtr->value; if (hostPtr != (RecovHostState *)NIL) { state = hostPtr->state & ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING); if (state == RECOV_HOST_ALIVE) { /* * Check for recent message traffic before admitting * that the other machine is up. */ Timer_GetTimeOfDay(&time, (int *)NIL, (Boolean *)NIL); Time_Subtract(time, hostPtr->time, &time); if (Time_GT(time, time_TenSeconds)) { state = RECOV_STATE_UNKNOWN; } } } } UNLOCK_MONITOR; return(state); } /* *---------------------------------------------------------------------- * * StartPinging -- * * Make sure there is a background pinging process for the host. * The state bit used to indicate pinging is reset by RpcHostCheck * after it finally gets in a good ping. * * Results: * None. * * Side effects: * Starts the pinging callback if not already in progress. 
* *---------------------------------------------------------------------- */ ENTRY void StartPinging(spriteID) int spriteID; { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; LOCK_MONITOR; hashPtr = Hash_LookOnly(recovHashTable, spriteID); hostPtr = (RecovHostState *)hashPtr->value; if ((hostPtr->state & RECOV_HOST_PINGING) == 0) { hostPtr->state |= RECOV_HOST_PINGING; Proc_CallFunc(CheckHost, spriteID, 0); } UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * CheckHost -- * * This is the call back setup when a host is detected as crashed * and we want to find out when it comes back up. This pings * the remote host if it's down or there hasn't been recent traffic. * A side effect of a successful ping is a call to RpcHostAlive which * triggers the recovery actions. * * Results: * None. * * Side effects: * This will pings the host unless there has been recent message * traffic. It reschedules itself if the ping fails. * *---------------------------------------------------------------------- */ static void CheckHost(data, callInfoPtr) ClientData data; Proc_CallInfo *callInfoPtr; { register int spriteID = (int)data; register int state; ReturnStatus status = SUCCESS; state = GetHostState(spriteID); switch (state) { case RECOV_HOST_DEAD: case RECOV_STATE_UNKNOWN: RECOV_TRACE(spriteID, state, RECOV_CUZ_PING_CHK); status = Rpc_Ping(spriteID); break; case RECOV_HOST_ALIVE: break; } if (status != SUCCESS) { /* * Try again later if the host is still down. */ callInfoPtr->interval = rpcPingSeconds * timer_IntOneSecond; } else { StopPinging(spriteID); callInfoPtr->interval = 0; } } /* *---------------------------------------------------------------------- * * StartPinging -- * * Make sure there is a background pinging process for the host. * The state bit used to indicate pinging is reset by RpcHostCheck * after it finally gets in a good ping. * * Results: * None. 
* * Side effects: * Starts the pinging callback if not already in progress. * *---------------------------------------------------------------------- */ ENTRY void StopPinging(spriteID) int spriteID; { register Hash_Entry *hashPtr; register RecovHostState *hostPtr; LOCK_MONITOR; hashPtr = Hash_LookOnly(recovHashTable, spriteID); hostPtr = (RecovHostState *)hashPtr->value; if ((hostPtr->state & RECOV_HOST_PINGING) == 0) { Sys_Panic(SYS_WARNING, "StopPinging found bad state\n"); } hostPtr->state &= ~RECOV_HOST_PINGING; UNLOCK_MONITOR; } /* *---------------------------------------------------------------------- * * Rpc_PrintRecovTraceRecord -- * * Format and print the client data part of a recovery trace record. * * Results: * None. * * Side effects: * Sys_Printf to the display. * *---------------------------------------------------------------------- */ int Rpc_PrintRecovTraceRecord(clientData, event, printHeaderFlag) ClientData clientData; /* Client data in the trace record */ int event; /* Type, or event, from the trace record */ Boolean printHeaderFlag; /* If TRUE, a header line is printed */ { RpcRecovTraceRecord *recPtr = (RpcRecovTraceRecord *)clientData; char *name; if (printHeaderFlag) { /* * Print column headers and a newline. 
*/ Sys_Printf("%10s %10s %17s\n", "Host", "State", "Event "); } if (clientData != (ClientData)NIL) { Net_SpriteIDToName(recPtr->spriteID, &name); if (name == (char *)NIL) { Sys_Printf("%10d ", recPtr->spriteID); } else { Sys_Printf("%10s ", name); } switch(recPtr->state & ~(RECOV_CRASH_CALLBACKS|RECOV_HOST_PINGING)) { case RECOV_STATE_UNKNOWN: Sys_Printf("%-8s", "Unknown"); break; case RECOV_HOST_ALIVE: Sys_Printf("%-8s ", "Alive"); break; case RECOV_HOST_DEAD: Sys_Printf("%-8s ", "Dead"); break; case RECOV_WAITING: Sys_Printf("%-8s ", "Waiting"); break; case RECOV_CRASH: Sys_Printf("%-8s ", "Crash callbacks"); break; case RECOV_REBOOT: Sys_Printf("%-8s ", "Reboot callbacks"); break; } Sys_Printf("%3s", (recPtr->state & RECOV_CRASH_CALLBACKS) ? " C " : " "); Sys_Printf("%3s", (recPtr->state & RECOV_HOST_PINGING) ? " P " : " "); switch(event) { case RECOV_CUZ_WAIT: Sys_Printf("waiting"); break; case RECOV_CUZ_WAKEUP: Sys_Printf("wakeup"); break; case RECOV_CUZ_INIT: Sys_Printf("init"); break; case RECOV_CUZ_REBOOT: Sys_Printf("reboot"); break; case RECOV_CUZ_CRASH: Sys_Printf("crash"); break; case RECOV_CUZ_DONE: Sys_Printf("done"); break; case RECOV_CUZ_PING_ASK: Sys_Printf("ping (ask)"); break; case RECOV_CUZ_PING_CHK: Sys_Printf("ping (check)"); break; case RPC_RECOV_TRACE_STALE: Sys_Printf("stale FS handle"); break; default: Sys_Printf("(%x)", event); break; } /* Our caller prints a newline */ } } /* *---------------------------------------------------------------------- * * Rpc_PrintRecovTrace -- * * Dump out the recovery trace. Called via a console L1 keystroke. * * Results: * None. * * Side effects: * Prints to the console. * *---------------------------------------------------------------------- */ void Rpc_PrintRecovTrace(numRecs) int numRecs; { if (numRecs <= 0 || numRecs > rpcRecovTraceLength) { numRecs = rpcRecovTraceLength; } Sys_Printf("RECOVERY TRACE\n"); Trace_Print(rpcRecovTraceHdrPtr, numRecs, Rpc_PrintRecovTraceRecord); }